Setup

Load libraries

library(ggplot2)
library(tidyr)
library(dplyr)
library(reshape2)

# For ARIMA
library(astsa)
library(forecast)

# For SIR
library(deSolve)
library(dfoptim)

Import data

import_data <- function(file) {
    df <- read.csv(file, header = FALSE, skip = 1)
    locations <- as.matrix(df[,1])
    dates <- as.matrix(read.csv(file, header = FALSE, nrows = 1)[1,-1])
    df <- as.data.frame(t(df[,-1]))
    rownames(df) <- dates
    colnames(df) <- locations
    # print(sapply(df, class))
    return(df)
}

cases_intl      <- import_data("data/International/International_covid_cases_data.csv")
deaths_intl     <- import_data("data/International/International_covid_deaths_data.csv")
cases_LA        <- import_data("data/LA/LA_cities_covid_data.csv")
cases_US        <- import_data("data/US_Counties/US_county_covid_cases_data.csv")
deaths_US       <- import_data("data/US_Counties/US_county_covid_deaths_data.csv")

Plot Data

Total Cases Per Day

overall_df <- function(df) {
    return(data.frame(days=seq(0,length(rownames(df))-1), 
                      cases=rowSums(df[,-1])))
}

overall_plot <- function(df, title) {
    ggplot(data=df, aes(x=days, y=cases, group=1)) + 
        geom_line() +
        ggtitle(title)
}

cases_LA_overall <- overall_df(cases_LA)
cases_US_overall <- overall_df(cases_US)
cases_intl_overall <- overall_df(cases_intl)

overall_plot(cases_LA_overall, "Overall LA Cases")

overall_plot(cases_US_overall, "Overall US Cases")

overall_plot(cases_intl_overall, "Overall International Cases")

Locations Plotted Individually

plot_all <- function(df, title, legend = FALSE) {
    rownames(df) <- seq(0,length(rownames(df))-1)
    m <- melt(as.matrix(df))
    colnames(m) <- c("Days", "Location", "Count")
    ggplot(m, aes(x=Days, y=Count, color=Location)) +
        geom_line(show.legend = legend) +
        scale_colour_viridis_d() +
        ggtitle(title)
}

max_counts <- function(df) {
    rownames(df) <- seq(0,length(rownames(df))-1)
    m <- melt(as.matrix(df))
    colnames(m) <- c("Days", "Location", "Count")
    return(m %>% group_by(Location) %>% slice_max(n=1, order_by=Count, with_ties = FALSE) %>% ungroup())
}
plot_all(cases_intl, "International Cases")

cases_intl_max <- max_counts(cases_intl)
cases_intl_max <- cases_intl_max[order(cases_intl_max$Count, decreasing = TRUE),]
cases_intl_max
## # A tibble: 188 x 3
##     Days Location         Count
##    <int> <fct>            <int>
##  1   176 US             3576157
##  2   176 Brazil         2012151
##  3   176 India          1003832
##  4   176 Russia          751612
##  5   176 Peru            341586
##  6   176 South Africa    324221
##  7   176 Mexico          324041
##  8   176 Chile           323698
##  9   176 United Kingdom  294116
## 10   176 Iran            267061
## # … with 178 more rows
plot_all(cases_US, "US County Cases")

cases_US_max <- max_counts(cases_US)
cases_US_max <- cases_US_max[order(cases_US_max$Count, decreasing = TRUE),]
cases_US_max
## # A tibble: 3,209 x 3
##     Days Location                 Count
##    <int> <fct>                    <int>
##  1   174 New York City, New York 223977
##  2   174 Los Angeles, California 136129
##  3   174 Cook, Illinois           95884
##  4   174 Maricopa, Arizona        81216
##  5   174 Miami-Dade, Florida      67712
##  6   174 Harris, Texas            47369
##  7   174 Nassau, New York         42354
##  8   174 Suffolk, New York        42112
##  9   174 Westchester, New York    35326
## 10   174 Dallas, Texas            34914
## # … with 3,199 more rows
plot_all(cases_LA, "LA Cases")

cases_LA_max <- max_counts(cases_LA)
cases_LA_max <- cases_LA_max[order(cases_LA_max$Count, decreasing = TRUE),]
cases_LA_max
## # A tibble: 339 x 3
##     Days Location                            Count
##    <int> <fct>                               <int>
##  1   104 Unincorporated - East Los Angeles    3327
##  2   104 City of South Gate                   2432
##  3   104 Los Angeles - Boyle Heights*         2375
##  4   104 City of Downey                       2349
##  5   104 City of Pomona                       2292
##  6   104 City of El Monte                     2244
##  7   104 City of Compton                      2049
##  8   104 Unincorporated - Castaic*            1822
##  9   104 City of Lynwood*                     1812
## 10   104 Unincorporated - Florence-Firestone  1804
## # … with 329 more rows

Locations with Most COVID-19 Cases and Other Selected Plots

plot_all(select(cases_intl, cases_intl_max$Location[1:10]), "International Cases Top 10", legend=TRUE)

plot_all(select(cases_US, cases_US_max$Location[1:10]), "US County Cases Top 10", legend=TRUE)

# locations with nice curves
# plot_all(select(cases_US, `New York City, New York`, `Suffolk, New York`, `Cook, Illinois`), "US County Cases Selected Plots", legend = TRUE)

# Los Angeles
# plot_all(select(cases_US, `Los Angeles, California`), "Los Angeles, California")
plot_all(select(cases_LA, cases_LA_max$Location[1:10]), "LA Cases Top 10", legend=TRUE)

Analyze with ARIMA

arima_wrap <- function(df, location, percentage = 0.7, verbose = TRUE) {
    timeseries = unlist(as.list(select(df, location)))
    if(verbose) {
        plot.ts(timeseries)
        plot.ts(diff(timeseries))
        acf(diff(timeseries))
    }
    
    train_series <- timeseries[1:length(timeseries)*percentage]
    test_series <- timeseries[-1:-length(timeseries)*percentage]
    
    # parametrize model
    AutoArimaModel=auto.arima(train_series)
    AutoArimaModel
    
    # train
    futurVal <- forecast(AutoArimaModel,h=10, level=c(99.5)) 
    futurVal <- forecast(AutoArimaModel,h=50, level=c(80)) #adjust timesteps ahead, and confidence level
    
    checkresiduals(AutoArimaModel)
    plot(futurVal)
    return(futurVal)
}
futurSuffolk <- arima_wrap(cases_US, "Suffolk, New York")
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(location)` instead of `location` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,2,0)
## Q* = 95.378, df = 7, p-value < 2.2e-16
## 
## Model df: 3.   Total lags used: 10

futurLA <- arima_wrap(cases_LA_overall, "cases")

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,2,1)
## Q* = 22.686, df = 6, p-value = 0.0009089
## 
## Model df: 4.   Total lags used: 10

futurUS <- arima_wrap(cases_US_overall, "cases")

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,2,0)
## Q* = 120.98, df = 5, p-value < 2.2e-16
## 
## Model df: 5.   Total lags used: 10

futurIntl <- arima_wrap(cases_intl_overall, "cases")

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,2,2)
## Q* = 70.454, df = 6, p-value = 3.3e-13
## 
## Model df: 4.   Total lags used: 10

forecasts = c(2,7,30)
futur <- list(futurSuffolk$mean[forecasts], futurUS$mean[forecasts], futurLA$mean[forecasts], futurIntl$mean[forecasts])
futur <- data.frame(matrix(unlist(futur), nrow=length(futur), byrow=T),stringsAsFactors=FALSE)
rownames(futur) <- c("Suffolk, New York", "All LA Cities", "All US Counties", "All International")
colnames(futur) <- c("2 Days", "1 Week", "1 Month")
futur
##                       2 Days     1 Week    1 Month
## Suffolk, New York   38666.99   39054.25   40752.36
## All LA Cities     1611784.46 1703922.52 2084156.34
## All US Counties     66611.48   71241.45   91621.37
## All International 5511041.30 5826711.49 7442012.61

Analyze with SIR

Infected <- cases_US$`Los Angeles, California`[-1:-25] # Suffolk, New York
Days <- seq(1,length(Infected))

#plot(Days, Infected)

SIR <- function(time, state, parameters) {
  par <- as.list(c(state, parameters))
  with(par, {
    dS <- -beta * I * S / N
    dI <- beta * I * S / N - gamma * I
    dR <- gamma * I
    list(c(dS, dI, dR))
  })
}

N <- 3.99e6 #1.477e6
init <- c(
  S = N - Infected[1],
  I = Infected[1],
  R = 0
)

# define a function to calculate the residual sum of squares
# (RSS), passing in parameters beta and gamma that are to be
# optimised for the best fit to the incidence data
RSS <- function(parameters) {
  names(parameters) <- c("beta", "gamma")
  out <- ode(y = init, times = Days, func = SIR, parms = parameters)
  fit <- out[, 3]
  sum((Infected - fit)^2)
}
# now find the values of beta and gamma that give the
# smallest RSS, which represents the best fit to the data.
# Start with values of 0.5 for each, and constrain them to
# the interval 0 to 1.0

lower = c(0, 0)
upper = c(1, 1)

Opt <- optim(c(0.5, 0.5),
  RSS,
  method = "L-BFGS-B",
  lower = lower,
  upper = upper
)
Opt$message
## [1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
# Opt <-nmkb(c(0.5, 0.5), RSS, lower=lower, upper=upper)
# Opt$message

# library(Rvmmin)
# Opt <- Rvmmin(c(0.5, 0.5), RSS, lower=lower, upper=upper)
# Opt$message

Opt_par <- setNames(Opt$par, c("beta", "gamma"))
Opt_par
##      beta     gamma 
## 0.4079545 0.3115256
Opt_par[1] = 0.38
Opt_par[2] = 0.29
# get the fitted values from our SIR model
fitted_cumulative_incidence <- data.frame(ode(
  y = init, times = Days,
  func = SIR, parms = Opt_par
))

fitted_cumulative_incidence <- fitted_cumulative_incidence %>%
  mutate(
    cumulative_incident_cases = Infected
  )

fitted_cumulative_incidence %>%
  ggplot(aes(x = time)) +
  geom_line(aes(y = I), colour = "red") +
  geom_point(aes(y = cumulative_incident_cases), colour = "blue") +
  labs(
    y = "Cumulative incidence",
    title = "COVID-19 fitted vs observed cumulative incidence, Los Angeles, California",
    subtitle = "(Red = fitted from SIR model, blue = observed)"
  ) +
  theme_minimal()

R0 <- as.numeric(Opt_par[1] / Opt_par[2])
R0
## [1] 1.310345